!pip install folium
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/ Requirement already satisfied: folium in /usr/local/lib/python3.10/dist-packages (0.14.0) Requirement already satisfied: branca>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from folium) (0.6.0) Requirement already satisfied: jinja2>=2.9 in /usr/local/lib/python3.10/dist-packages (from folium) (3.1.2) Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from folium) (1.22.4) Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from folium) (2.27.1) Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2>=2.9->folium) (2.1.2) Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->folium) (1.26.15) Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->folium) (2022.12.7) Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->folium) (2.0.12) Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->folium) (3.4)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import folium
from folium.plugins import HeatMap
from google.colab import files
# Upload the input file(s) through google.colab; the call's return value is
# kept in `uploaded`.
uploaded = files.upload()
Saving bus_df1.csv to bus_df1.csv
# Alternative input file (disabled):
# df_bus = pd.read_csv('bus_df.csv')
# Load the business table from the CSV file.
df_bus = pd.read_csv('bus_df1.csv')
# Number of missing values in each column.
df_bus.isna().sum()
business_id 0 name 0 address 5127 city 0 state 0 postal_code 73 latitude 0 longitude 0 stars 0 review_count 0 is_open 0 attributes 13744 categories 103 hours 23223 dtype: int64
# Remove three columns with missing values that are not used later, then drop
# every row that still contains a missing value.
# The explicit .copy() makes df_bus an independent frame, so the column
# assignment below (and later ones) no longer raises SettingWithCopyWarning.
df_bus = (
    df_bus
    .drop(columns=['address', 'postal_code', 'hours'])
    .dropna(axis=0)
    .copy()
)
# Make sure the category text is held as plain Python objects.
df_bus['categories'] = df_bus['categories'].astype('object')
<ipython-input-159-c570a7e9626a>:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_bus['categories'] = df_bus['categories'].astype('object')
# Preview the first five rows, transposed so that each column reads as a row.
df_bus.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| business_id | Pns2l4eNsfO8kk83dixA6A | mpf3x-BjTdTEA3yCZrAYPw | tUFrWirKiKi_TAnsVWINQQ | MTSW4McQd7CbVtyjqoe9mw | mWMc6_wTdE0EUBKIGXDVfA |
| name | Abby Rappoport, LAC, CMQ | The UPS Store | Target | St Honore Pastries | Perkiomen Valley Brewery |
| city | Santa Barbara | Affton | Tucson | Philadelphia | Green Lane |
| state | CA | MO | AZ | PA | PA |
| latitude | 34.426679 | 38.551126 | 32.223236 | 39.955505 | 40.338183 |
| longitude | -119.711197 | -90.335695 | -110.880452 | -75.155564 | -75.471659 |
| stars | 5.0 | 3.0 | 3.5 | 4.0 | 4.5 |
| review_count | 7 | 15 | 22 | 80 | 13 |
| is_open | 0 | 1 | 0 | 1 | 1 |
| attributes | {'ByAppointmentOnly': 'True'} | {'BusinessAcceptsCreditCards': 'True'} | {'BikeParking': 'True', 'BusinessAcceptsCredit... | {'RestaurantsDelivery': 'False', 'OutdoorSeati... | {'BusinessAcceptsCreditCards': 'True', 'Wheelc... |
| categories | Doctors, Traditional Chinese Medicine, Naturop... | Shipping Centers, Local Services, Notaries, Ma... | Department Stores, Shopping, Fashion, Home & G... | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | Brewpubs, Breweries, Food |
# Summary statistics of the numeric columns.
df_bus.describe()
| latitude | longitude | stars | review_count | is_open | |
|---|---|---|---|---|---|
| count | 136601.000000 | 136601.000000 | 136601.000000 | 136601.000000 | 136601.000000 |
| mean | 36.675350 | -89.271785 | 3.623319 | 48.083623 | 0.785858 |
| std | 5.850294 | 14.855670 | 0.943832 | 126.477539 | 0.410227 |
| min | 27.555127 | -120.095137 | 1.000000 | 5.000000 | 0.000000 |
| 25% | 32.192213 | -90.348605 | 3.000000 | 8.000000 | 1.000000 |
| 50% | 38.778279 | -86.120708 | 4.000000 | 16.000000 | 1.000000 |
| 75% | 39.953936 | -75.409757 | 4.500000 | 41.000000 | 1.000000 |
| max | 53.679197 | -73.200457 | 5.000000 | 7568.000000 | 1.000000 |
# Summary (count / unique / top / freq) of the object-typed columns.
df_bus.describe(include='object')
| business_id | name | city | state | attributes | categories | |
|---|---|---|---|---|---|---|
| count | 136601 | 136601 | 136601 | 136601 | 136601 | 136601 |
| unique | 136601 | 103432 | 1347 | 27 | 87661 | 78614 |
| top | Pns2l4eNsfO8kk83dixA6A | Starbucks | Philadelphia | PA | {'BusinessAcceptsCreditCards': 'True'} | Beauty & Spas, Nail Salons |
| freq | 1 | 714 | 13399 | 31080 | 9385 | 950 |
# Heat map of every location left in df_bus
# The map object is named heat_map so that it does not shadow the builtin map().
heat_map = folium.Map(location=[39.8283, -98.5795], zoom_start=5)
# Read the [latitude, longitude] pairs directly from the two columns; this
# yields the same list as walking the frame row by row, without the per-row cost.
heat_data = df_bus[['latitude', 'longitude']].values.tolist()
HeatMap(heat_data).add_to(heat_map)
heat_map
# Split the table by the is_open flag: rows flagged 1 go to df_open,
# rows flagged 0 go to df_closed.
df_open = df_bus.loc[df_bus['is_open'].eq(1)]
df_closed = df_bus.loc[df_bus['is_open'].eq(0)]
# Heat map of the locations in df_open (is_open == 1)
# The map object is named heat_map so that it does not shadow the builtin map().
heat_map = folium.Map(location=[39.8283, -98.5795], zoom_start=5)
# Read the [latitude, longitude] pairs directly from the two columns; this
# yields the same list as walking the frame row by row, without the per-row cost.
heat_data = df_open[['latitude', 'longitude']].values.tolist()
HeatMap(heat_data).add_to(heat_map)
heat_map
# Heat map of the locations in df_closed (is_open == 0)
# The map object is named heat_map so that it does not shadow the builtin map().
heat_map = folium.Map(location=[39.8283, -98.5795], zoom_start=5)
# Read the [latitude, longitude] pairs directly from the two columns; this
# yields the same list as walking the frame row by row, without the per-row cost.
heat_data = df_closed[['latitude', 'longitude']].values.tolist()
HeatMap(heat_data).add_to(heat_map)
heat_map
# Report how many rows fall in each group, then show the two counts as bars.
open_count = len(df_open)
closed_count = len(df_closed)
print(f"Count of restaurants that are open {open_count}")
print(f"Count of restaurants that are closed {closed_count}")
sns.barplot(
    x=['Open Restaurant', 'Closed Restaurant'],
    y=[open_count, closed_count],
)
Count of restaurants that are open 107349 Count of restaurants that are closed 29252
<Axes: >
# df = df_bus.groupby(['city', 'is_open']).count()
# df.reset_index(inplace=True)
# df.head()
# Ratio of open businesses over all businesses, per state.
# is_open holds only 0/1 and has no missing values at this point, so the group
# mean equals sum / count. Naming the series explicitly avoids depending on the
# auto-generated 'index' / 0 column labels that reset_index would otherwise produce.
df = (
    df_bus.groupby('state')['is_open']
    .mean()
    .rename('ratio')
    .reset_index()
)
url = "https://raw.githubusercontent.com/python-visualization/folium/main/examples/data"
state_geo = f"{url}/us-states.json"
ratio_open_map = folium.Map(location=[48, -102], zoom_start=4)
# Colour-step edges are the deciles of the observed ratios.
bins = list(df["ratio"].quantile([0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]))
folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=df,
    columns=["state", "ratio"],
    # Match each row's state code against the id of the GeoJSON feature.
    key_on="feature.id",
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.2,
    bins=bins,
    # The plotted values are fractions in [0, 1], not percentages.
    legend_name="Open restaurants / total restaurants (ratio)",
).add_to(ratio_open_map)
folium.LayerControl().add_to(ratio_open_map)
ratio_open_map
# Preview the first five rows again, transposed so each column reads as a row.
df_bus.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| business_id | Pns2l4eNsfO8kk83dixA6A | mpf3x-BjTdTEA3yCZrAYPw | tUFrWirKiKi_TAnsVWINQQ | MTSW4McQd7CbVtyjqoe9mw | mWMc6_wTdE0EUBKIGXDVfA |
| name | Abby Rappoport, LAC, CMQ | The UPS Store | Target | St Honore Pastries | Perkiomen Valley Brewery |
| city | Santa Barbara | Affton | Tucson | Philadelphia | Green Lane |
| state | CA | MO | AZ | PA | PA |
| latitude | 34.426679 | 38.551126 | 32.223236 | 39.955505 | 40.338183 |
| longitude | -119.711197 | -90.335695 | -110.880452 | -75.155564 | -75.471659 |
| stars | 5.0 | 3.0 | 3.5 | 4.0 | 4.5 |
| review_count | 7 | 15 | 22 | 80 | 13 |
| is_open | 0 | 1 | 0 | 1 | 1 |
| attributes | {'ByAppointmentOnly': 'True'} | {'BusinessAcceptsCreditCards': 'True'} | {'BikeParking': 'True', 'BusinessAcceptsCredit... | {'RestaurantsDelivery': 'False', 'OutdoorSeati... | {'BusinessAcceptsCreditCards': 'True', 'Wheelc... |
| categories | Doctors, Traditional Chinese Medicine, Naturop... | Shipping Centers, Local Services, Notaries, Ma... | Department Stores, Shopping, Fashion, Home & G... | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | Brewpubs, Breweries, Food |
# The category text is not normalised: the same categories listed in a different
# order (e.g. 'restaurants, pizza' vs 'pizza, restaurants') are distinct strings
# and are therefore not treated as the same value.
df = df_bus['categories'].unique()
len(df)
78614
# Check the dtype of the categories column.
df_bus.categories.dtype
dtype('O')
from collections import Counter

# For each individual category, count how many of the distinct category strings
# in `df` mention it. Names are compared lower-cased with surrounding blanks removed.
category_counter = Counter()
for row in df:
    # str.split returns [row] when there is no comma, so a single-category
    # string stays one token instead of being broken into single characters.
    for word in row.split(","):
        word = word.strip().lower()
        if word:  # ignore empty tokens left by stray or trailing commas
            category_counter[word] += 1

# Parallel lists in first-seen order, kept under their original names.
unique_restaurants_list = list(category_counter)
unique_restaurants_list_count = list(category_counter.values())
df = pd.DataFrame({"Category": unique_restaurants_list, "Count": unique_restaurants_list_count})
# Check whether malformed category names matter by listing every name shorter
# than two characters together with its count.
df[df['Category'].str.len() < 2]
| Category | Count | |
|---|---|---|
| 469 | l | 13 |
| 470 | o | 11 |
| 471 | c | 9 |
| 472 | a | 15 |
| 473 | 18 | |
| 474 | s | 15 |
| 475 | e | 26 |
| 476 | r | 11 |
| 477 | v | 10 |
| 478 | i | 14 |
| 531 | t | 16 |
| 532 | f | 4 |
| 763 | h | 6 |
| 764 | m | 5 |
| 855 | u | 4 |
| 856 | n | 12 |
| 902 | d | 2 |
| 933 | g | 4 |
| 934 | b | 2 |
| 935 | y | 1 |
| 936 | & | 6 |
| 937 | p | 5 |
# Order the categories from most to least frequent, print how many there are,
# and show the top of the table.
df = df.sort_values('Count', ascending=False)
print(df.shape[0])
df.head()
1326
| Category | Count | |
|---|---|---|
| 17 | restaurants | 31389 |
| 18 | food | 19315 |
| 12 | shopping | 17654 |
| 36 | nightlife | 10452 |
| 34 | bars | 9492 |
# Mark 20 categories as the most popular ones: rows 3 to 22 of the sorted table.
# NOTE(review): the two highest-ranked rows are skipped — presumably on purpose; confirm.
df = df.iloc[2:22]
df.head()
| Category | Count | |
|---|---|---|
| 12 | shopping | 17654 |
| 36 | nightlife | 10452 |
| 34 | bars | 9492 |
| 154 | home services | 8486 |
| 7 | local services | 7572 |
# Number of categories shown in each chart
rows_per_chart = 10
# Ceiling division: how many charts are needed to show every row of df
num_charts = -(-len(df) // rows_per_chart)
# Lay the charts out side by side in a single row
nrows = 1
ncols = num_charts
# squeeze=False always returns a 2-D array of Axes, so the loop below also works
# when only one chart is needed (plt.subplots would otherwise return a bare Axes,
# which cannot be enumerated).
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 5), squeeze=False)
for i, ax in enumerate(axes.ravel()):
    # Rows belonging to the current chart
    chart_data = df.iloc[i * rows_per_chart:(i + 1) * rows_per_chart]
    # Horizontal bar chart on the current subplot
    sns.barplot(y='Category', x='Count', data=chart_data, ax=ax)
    ax.set_title(f'Chart {i + 1}')
# Adjust the spacing between subplots, then render the figure
plt.tight_layout()
plt.show()
# Add a 0/1 column telling whether the category text mentions a popular category.
# NOTE(review): this is a substring test, so a popular name also matches longer
# category names that contain it — confirm this is intended.
popular_terms = [term.lower() for term in df['Category']]


def _mentions_popular(category_text):
    """Return 1 when any popular term occurs in the lower-cased text, else 0."""
    lowered = category_text.lower()
    for term in popular_terms:
        if term in lowered:
            return 1
    return 0


df_bus['popular_category'] = df_bus['categories'].apply(_mentions_popular)
df_bus.head()
| business_id | name | city | state | latitude | longitude | stars | review_count | is_open | attributes | categories | popular_category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Pns2l4eNsfO8kk83dixA6A | Abby Rappoport, LAC, CMQ | Santa Barbara | CA | 34.426679 | -119.711197 | 5.0 | 7 | 0 | {'ByAppointmentOnly': 'True'} | Doctors, Traditional Chinese Medicine, Naturop... | 1 |
| 1 | mpf3x-BjTdTEA3yCZrAYPw | The UPS Store | Affton | MO | 38.551126 | -90.335695 | 3.0 | 15 | 1 | {'BusinessAcceptsCreditCards': 'True'} | Shipping Centers, Local Services, Notaries, Ma... | 1 |
| 2 | tUFrWirKiKi_TAnsVWINQQ | Target | Tucson | AZ | 32.223236 | -110.880452 | 3.5 | 22 | 0 | {'BikeParking': 'True', 'BusinessAcceptsCredit... | Department Stores, Shopping, Fashion, Home & G... | 1 |
| 3 | MTSW4McQd7CbVtyjqoe9mw | St Honore Pastries | Philadelphia | PA | 39.955505 | -75.155564 | 4.0 | 80 | 1 | {'RestaurantsDelivery': 'False', 'OutdoorSeati... | Restaurants, Food, Bubble Tea, Coffee & Tea, B... | 1 |
| 4 | mWMc6_wTdE0EUBKIGXDVfA | Perkiomen Valley Brewery | Green Lane | PA | 40.338183 | -75.471659 | 4.5 | 13 | 1 | {'BusinessAcceptsCreditCards': 'True', 'Wheelc... | Brewpubs, Breweries, Food | 0 |
# Share of rows whose categories include one of the popular categories
popular_total = df_bus['popular_category'].sum()
popular_total / len(df_bus)
0.823244339353299
# Row count for every (is_open, popular_category) combination, as a flat table
# with the count stored in a 'Category_count' column.
df = (
    df_bus.groupby(['is_open', 'popular_category'])['popular_category']
    .count()
    .rename('Category_count')
    .reset_index()
)
df
| is_open | popular_category | Category_count | |
|---|---|---|---|
| 0 | 0 | 0 | 8097 |
| 1 | 0 | 1 | 21155 |
| 2 | 1 | 0 | 16048 |
| 3 | 1 | 1 | 91301 |
# Bar plot of the counts per is_open value, split by the popular_category flag
sns.barplot(
    data=df,
    x='is_open',
    y='Category_count',
    hue='popular_category',
)
<Axes: xlabel='is_open', ylabel='Category_count'>
# Look at the raw attributes text: first row, then all distinct values.
df_bus['attributes'].iloc[0]
df_bus['attributes'].unique()
array(["{'ByAppointmentOnly': 'True'}",
"{'BusinessAcceptsCreditCards': 'True'}",
'{\'BikeParking\': \'True\', \'BusinessAcceptsCreditCards\': \'True\', \'RestaurantsPriceRange2\': \'2\', \'CoatCheck\': \'False\', \'RestaurantsTakeOut\': \'False\', \'RestaurantsDelivery\': \'False\', \'Caters\': \'False\', \'WiFi\': "u\'no\'", \'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'WheelchairAccessible\': \'True\', \'HappyHour\': \'False\', \'OutdoorSeating\': \'False\', \'HasTV\': \'False\', \'RestaurantsReservations\': \'False\', \'DogsAllowed\': \'False\', \'ByAppointmentOnly\': \'False\'}',
...,
'{\'BusinessAcceptsCreditCards\': \'True\', \'RestaurantsPriceRange2\': \'2\', \'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'BikeParking\': \'True\', \'WiFi\': "u\'no\'"}',
'{\'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'BikeParking\': \'True\', \'RestaurantsPriceRange2\': \'4\', \'BusinessAcceptsCreditCards\': \'True\', \'RestaurantsTakeOut\': \'None\', \'RestaurantsDelivery\': \'None\'}',
'{\'WheelchairAccessible\': \'True\', \'BusinessAcceptsBitcoin\': \'False\', \'RestaurantsPriceRange2\': \'1\', \'BusinessAcceptsCreditCards\': \'True\', \'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'BikeParking\': \'False\', \'WiFi\': "u\'free\'", \'ByAppointmentOnly\': \'False\'}'],
dtype=object)
# Extracting features: one 0/1 column per boolean-like business attribute.
import ast


def _flag_state(raw):
    """Classify one attribute value.

    Returns 1 for a "yes" value, 0 for a "no"/unknown value, and None when the
    value is not a boolean-like flag. Top-level values arrive as the strings
    'True' / 'False' / 'None'; values inside a parsed nested dict are the real
    objects True / False / None, so both spellings are accepted.
    """
    if raw is True or raw == 'True':
        return 1
    if raw is False or raw is None or raw in ('False', 'None'):
        return 0
    return None


def _nested_attributes(raw):
    """Return the dict encoded in the string *raw*, or None if it holds no dict."""
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        return None
    return parsed if isinstance(parsed, dict) else None


def _attribute_flags(raw_attributes):
    """Flatten one attributes string into {lower-cased key: 0 or 1}.

    Flags found inside nested dict values are lifted to the top level under
    their own lower-cased key. A key ends up as 1 when any occurrence of it
    in the row is a "yes".
    """
    flags = {}

    def record(name, raw):
        state = _flag_state(raw)
        if state is None:
            return False
        name = str(name).lower()
        flags[name] = max(flags.get(name, 0), state)
        return True

    for key, value in ast.literal_eval(raw_attributes).items():
        if record(key, value):
            continue
        # Not a plain flag: it may be a nested dict of flags stored as text.
        nested = _nested_attributes(value)
        if nested is None:
            continue
        for nested_key, nested_value in nested.items():
            record(nested_key, nested_value)
    return flags


# Build all flag columns in one pass and attach them at once, instead of
# writing single cells into df_bus while iterating over it.
flag_rows = [_attribute_flags(raw) for raw in df_bus['attributes']]
flag_columns = pd.DataFrame(flag_rows, index=df_bus.index).fillna(0).astype(int)
# Names of the created columns, in first-seen order.
unique_values = list(flag_columns.columns)
# Dropping same-named columns first keeps the cell safe to re-run.
df_bus = pd.concat(
    [df_bus.drop(columns=flag_columns.columns, errors='ignore'), flag_columns],
    axis=1,
)
df_bus.head()
| business_id | name | city | state | latitude | longitude | stars | review_count | is_open | attributes | ... | noiselevel | open24hours | restaurantscounterservice | bestnights | hairspecializesin | music | alcohol | restaurantsattire | byobcorkage | dietaryrestrictions | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Pns2l4eNsfO8kk83dixA6A | Abby Rappoport, LAC, CMQ | Santa Barbara | CA | 34.426679 | -119.711197 | 5.0 | 7 | 0 | {'ByAppointmentOnly': 'True'} | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | mpf3x-BjTdTEA3yCZrAYPw | The UPS Store | Affton | MO | 38.551126 | -90.335695 | 3.0 | 15 | 1 | {'BusinessAcceptsCreditCards': 'True'} | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | tUFrWirKiKi_TAnsVWINQQ | Target | Tucson | AZ | 32.223236 | -110.880452 | 3.5 | 22 | 0 | {'BikeParking': 'True', 'BusinessAcceptsCredit... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | MTSW4McQd7CbVtyjqoe9mw | St Honore Pastries | Philadelphia | PA | 39.955505 | -75.155564 | 4.0 | 80 | 1 | {'RestaurantsDelivery': 'False', 'OutdoorSeati... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | mWMc6_wTdE0EUBKIGXDVfA | Perkiomen Valley Brewery | Green Lane | PA | 40.338183 | -75.471659 | 4.5 | 13 | 1 | {'BusinessAcceptsCreditCards': 'True', 'Wheelc... | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 50 columns
# Mean star rating for each value of is_open, as a two-column table
df = df_bus.groupby('is_open', as_index=False)['stars'].mean()
# Bar chart of the mean rating per group
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.barplot(data=df, x='is_open', y='stars')
<Axes: xlabel='is_open', ylabel='stars'>
# Keep only the columns used for the correlation: drop identifiers, location
# and raw text columns.
non_feature_columns = [
    'business_id', 'name', 'city', 'state',
    'latitude', 'longitude', 'attributes', 'categories',
]
df = df_bus.drop(columns=non_feature_columns)
# Remove every column whose values sum to zero.
zero_sum_columns = [column for column in df.columns if df[column].sum() == 0]
df = df.drop(columns=zero_sum_columns)
# Pairwise correlation of the remaining columns
corr_matrix = df.corr()
plt.figure(figsize=(10, 8))
# Draw the matrix as a heatmap without cell annotations
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()